<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>ArXiv CS.CV Papers (Image/Video Generation) - April 30, 2025</title>
    <!-- Tailwind CDN build: supplies the utility classes (mx-auto, text-xs, ml-1, ...) used below -->
    <script src="https://cdn.tailwindcss.com"></script>
    <!-- NOTE(review): this is the framer-motion *dev* UMD bundle. Framer Motion's UMD build does
         not automatically animate plain-HTML <motion.div> custom tags (that syntax is JSX/React);
         confirm a bootstrap script consumes the data-motion-element hooks, or the initial/animate
         attributes below are inert and the cards simply render unanimated. -->
    <script src="https://cdnjs.cloudflare.com/ajax/libs/framer-motion/10.16.4/framer-motion.dev.js"></script>
    <!-- Example using Font Awesome (replace with your preferred icon library if needed) -->
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.2/css/all.min.css">
    <style>
        /* Inter is the single typeface for the whole page (weights 400/600/700). */
        @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&display=swap');

        :root {
            /* New Palette: Light, Clean, Futuristic with Teal/Aqua accents */
            --bg-color: #f8fafc; /* Tailwind slate-50 (Very Light Gray) */
            --card-bg-color: #ffffff; /* White */
            --text-color: #1e293b; /* Tailwind slate-800 (Dark Gray-Blue) */
            --text-muted-color: #64748b; /* Tailwind slate-500 (Medium Gray-Blue) */
            --header-color: #0f172a; /* Tailwind slate-900 (Very Dark Blue) */
            --highlight-primary: #14b8a6; /* Tailwind teal-500 */
            --highlight-secondary: #67e8f9; /* Tailwind cyan-300 */
            --border-color: #e2e8f0; /* Tailwind slate-200 (Light Gray) */
            --shadow-color: rgba(15, 23, 42, 0.08); /* Subtle shadow based on slate-900 */
        }

        body {
            background-color: var(--bg-color);
            color: var(--text-color);
            font-family: 'Inter', sans-serif;
            overflow-x: hidden; /* Prevent horizontal scroll */
            line-height: 1.6;
        }

        .bento-grid {
            display: grid;
            gap: 1.5rem; /* Tailwind gap-6 */
            grid-template-columns: 1fr; /* Force single column */
            padding-bottom: 4rem; /* Add padding at the bottom */
        }

        .bento-item {
            /* Apply semi-transparent white background and blur
               (backdrop-filter only shows through a non-opaque background) */
            background-color: rgba(255, 255, 255, 0.7); /* White with 70% opacity */
            backdrop-filter: blur(10px); /* Apply blur effect */
            -webkit-backdrop-filter: blur(10px); /* Safari prefix */
            border-radius: 1rem; /* Slightly larger radius */
            padding: 1.75rem; /* Slightly more padding */
            border: 1px solid rgba(226, 232, 240, 0.5); /* Lighter border with transparency */
            box-shadow: 0 4px 12px var(--shadow-color);
            transition: transform 0.3s ease-out, box-shadow 0.3s ease-out, background-color 0.3s ease-out;
            overflow: hidden; /* Ensure content doesn't overflow */
            position: relative; /* For potential pseudo-elements */
        }

        /* Removed ::before pseudo-element for a cleaner look */


        .bento-item:hover {
            transform: translateY(-6px);
            box-shadow: 0 10px 20px var(--shadow-color), 0 4px 8px rgba(15, 23, 42, 0.06); /* Adjusted hover shadow */
        }

        .paper-title {
            font-size: 1.125rem; /* Tailwind text-lg */
            font-weight: 600; /* Tailwind font-semibold */
            color: var(--highlight-primary); /* Use new primary highlight */
            margin-bottom: 0.75rem; /* Tailwind mb-3 */
            line-height: 1.4;
        }

        .paper-summary {
            font-size: 0.875rem; /* Tailwind text-sm */
            color: var(--text-muted-color);
            margin-bottom: 1.25rem; /* Tailwind mb-5 */
            line-height: 1.6;
        }

        .paper-link {
            display: inline-flex; /* Use flex for icon alignment */
            align-items: center;
            font-size: 0.875rem; /* Tailwind text-sm */
            font-weight: 600;
            color: var(--highlight-primary);
            text-decoration: none;
            padding: 0.5rem 1rem; /* Add padding */
            border-radius: 0.5rem; /* Slightly rounder */
            background-color: rgba(20, 184, 166, 0.08); /* Subtle teal background */
            border: 1px solid rgba(20, 184, 166, 0.2);
            transition: background-color 0.3s ease, color 0.3s ease, transform 0.2s ease;
        }

        /* Icon spacing inside the link; note this (0,1,1 specificity) overrides a
           Tailwind margin utility placed directly on the icon. */
        .paper-link i {
            margin-right: 0.5rem; /* Tailwind mr-2 */
            transition: transform 0.3s ease;
        }

        .paper-link:hover {
            background-color: rgba(20, 184, 166, 0.15);
            color: #0d9488; /* Darker teal on hover */
            transform: translateY(-1px);
        }
        .paper-link:hover i {
             transform: translateX(2px);
        }

        .paper-authors {
            font-size: 0.75rem; /* Tailwind text-xs */
            color: var(--text-muted-color);
            margin-top: 1rem; /* Tailwind mt-4 */
            font-style: italic;
        }

        .header {
            text-align: center;
            margin-bottom: 3rem; /* Tailwind mb-12 */
            padding-top: 3rem; /* Tailwind pt-12 */
        }

        .header h1 {
            font-size: 2.5rem; /* Between Tailwind text-4xl (2.25rem) and text-5xl (3rem) */
            font-weight: 700; /* Tailwind font-bold */
            color: var(--header-color);
            letter-spacing: -0.025em; /* Tailwind tracking-tight */
            margin-bottom: 0.5rem;
            /* Optional: Add a subtle text gradient */
            /* background: linear-gradient(90deg, var(--highlight-primary), var(--highlight-secondary)); */
            /* -webkit-background-clip: text; */
            /* -webkit-text-fill-color: transparent; */
        }

        .header p {
            font-size: 1.125rem; /* Tailwind text-lg */
            color: var(--text-muted-color);
            margin-top: 0.5rem; /* Tailwind mt-2 */
            max-width: 600px;
            margin-left: auto;
            margin-right: auto;
        }

        .footer {
            text-align: center;
            color: var(--text-muted-color);
            font-size: 0.875rem; /* Tailwind text-sm */
            padding-top: 2rem;
            padding-bottom: 2rem; /* Tailwind py-8 */
            border-top: 1px solid var(--border-color);
            margin-top: 4rem;
        }

        /* Simple line graphic element (optional): teal line fading out at both ends */
        .line-graphic {
            height: 1px; /* Thinner line */
            background: linear-gradient(90deg, rgba(20, 184, 166, 0), var(--highlight-primary), rgba(20, 184, 166, 0));
            opacity: 0.6;
            margin: 1.5rem 0; /* Adjust margin */
        }

        /* Framer Motion requires the script, styles enhance appearance.
           Intentionally empty placeholder ruleset for elements carrying the
           data-motion-element hook. */
        [data-motion-element] {
             /* Base styles for elements animated by Framer Motion */
        }

        .paper-tldr {
            font-size: 0.95rem; /* Slightly bigger than summary */
            color: #475569; /* Changed to Tailwind slate-600 (slightly darker than summary) */
            margin-top: 0.75rem; /* Tailwind mt-3 */
            margin-bottom: 0.75rem; /* Tailwind mb-3 */
            /* font-style: italic; */
            font-weight: bold;
        }

        .paper-rating {
            margin-top: 1rem; /* Tailwind mt-4 */
            margin-bottom: 1rem; /* Tailwind mb-4 */
            color: #f59e0b; /* Tailwind amber-500 */
        }

        .paper-rating i {
            margin-right: 0.125rem; /* Tailwind mr-0.5 */
        }

        /* Apply consistent star color to sub-ratings */
        .paper-sub-ratings .rating-item i {
            color: #f59e0b; /* Match overall rating star color (amber-500) */
            margin-right: 0.125rem; /* Consistent spacing */
        }

    </style>
</head>
<body class="container mx-auto px-4 antialiased">

    <!-- Page header. The <motion.div>/initial/animate attributes mirror Framer Motion's JSX API;
         presumably a script elsewhere reads the data-motion-element hook to animate them -->
    <motion.div
        initial="{ opacity: 0, y: -30 }"
        animate="{ opacity: 1, y: 0 }"
        transition="{ duration: 0.6, ease: 'easeOut' }"
        class="header"
        data-motion-element
    >
        <h1>AIGC Daily Papers</h1>
        <p>Daily papers related to Image/Video/Multimodal Generation from cs.CV</p>
        <p>April 30, 2025</p>
        <div class="line-graphic mt-4 mb-8 mx-auto w-1/4"></div> <!-- Added line graphic -->
    </motion.div>

    <!-- One .bento-item card per paper is appended inside this grid -->
    <div class="bento-grid" id="paper-grid">
        
        <!-- Paper card. The CSS-style comment that sat inside the start tag was invalid HTML
             (parsed as bogus attributes) and is moved here: viewport amount 0.2 means the
             entrance animation triggers when 20% of the card is visible. -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.0, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">YoChameleon: Personalized Vision and Language Generation</h2>
            <p class="paper-summary">Large Multimodal Models (e.g., GPT-4, Gemini, Chameleon) have evolved into
powerful tools with millions of users. However, they remain generic models and
lack personalized knowledge of specific user concepts. Previous work has
explored personalization for text generation, yet it remains unclear how these
methods can be adapted to new modalities, such as image generation. In this
paper, we introduce Yo'Chameleon, the first attempt to study personalization
for large multimodal models. Given 3-5 images of a particular concept,
Yo'Chameleon leverages soft-prompt tuning to embed subject-specific information
to (i) answer questions about the subject and (ii) recreate pixel-level details
to produce images of the subject in new contexts. Yo'Chameleon is trained with
(i) a self-prompting optimization mechanism to balance performance across
multiple modalities, and (ii) a “soft-positive” image generation approach to
enhance image quality in a few-shot setting.</p>

            <p class="paper-tldr"><strong>TLDR</strong>: yo'chameleon introduces a method to personalize large multimodal models using soft-prompt tuning and few-shot learning to answer questions and generate images of specific subjects, addressing a gap in personalized multimodal generation.</p>
            <!-- Chinese translation of the TLDR; lang attribute lets screen readers switch voice -->
            <p class="paper-tldr" lang="zh-Hans"><strong>TLDR</strong>: yo'chameleon 提出了一种使用软提示调整和少样本学习个性化大型多模态模型的方法，以回答问题并生成特定对象的图像，弥补了个性化多模态生成方面的空白。</p>

            <!-- Decorative star icons are aria-hidden; the textual (n/10) score carries the info -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
            </div>

            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i></span>
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>

            <!-- https + rel="noopener noreferrer" for the new-tab external link;
                 dead mr-1 removed (overridden by the .paper-link i margin rule) -->
            <a href="https://arxiv.org/abs/2504.20998v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Thao Nguyen, Krishna Kumar Singh, Jing Shi, Trung Bui, Yong Jae Lee, Yuheng Li</p>
        </motion.div>
        
        <!-- Paper card. The CSS-style comment that sat inside the start tag was invalid HTML
             (parsed as bogus attributes) and is moved here: viewport amount 0.2 means the
             entrance animation triggers when 20% of the card is visible. -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.05, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">X-Fusion: Introducing New Modality to Frozen Large Language Models</h2>
            <p class="paper-summary">We propose X-Fusion, a framework that extends pretrained Large Language
Models (LLMs) for multimodal tasks while preserving their language
capabilities. X-Fusion employs a dual-tower design with modality-specific
weights, keeping the LLM's parameters frozen while integrating vision-specific
information for both understanding and generation. Our experiments demonstrate
that X-Fusion consistently outperforms alternative architectures on both
image-to-text and text-to-image tasks. We find that incorporating
understanding-focused data improves generation quality, reducing image data
noise enhances overall performance, and feature alignment accelerates
convergence for smaller models but has minimal impact on larger ones. Our
findings provide valuable insights into building efficient unified multimodal
models.</p>

            <p class="paper-tldr"><strong>TLDR</strong>: x-fusion is a framework that extends frozen llms for multimodal tasks using modality-specific weights, achieving state-of-the-art performance in image-to-text and text-to-image generation while preserving language capabilities. the paper gives valuable insights into efficiently unified multimodal models.</p>
            <!-- Chinese translation of the TLDR; lang attribute lets screen readers switch voice -->
            <p class="paper-tldr" lang="zh-Hans"><strong>TLDR</strong>: x-fusion是一种框架，它使用特定于模态的权重来扩展冻结的llm，以用于多模态任务，在图像到文本和文本到图像生成中实现了最先进的性能，同时保留了语言能力。该论文为构建高效的统一多模态模型提供了宝贵的见解。</p>

            <!-- Decorative star icons are aria-hidden; the textual (n/10) score carries the info -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
            </div>

            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i></span>
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>

            <!-- https + rel="noopener noreferrer" for the new-tab external link;
                 dead mr-1 removed (overridden by the .paper-link i margin rule) -->
            <a href="https://arxiv.org/abs/2504.20996v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Sicheng Mo, Thao Nguyen, Xun Huang, Siddharth Srinivasan Iyer, Yijun Li, Yuchen Liu, Abhishek Tandon, Eli Shechtman, Krishna Kumar Singh, Yong Jae Lee, Bolei Zhou, Yuheng Li</p>
        </motion.div>
        
        <!-- Paper card. The CSS-style comment that sat inside the start tag was invalid HTML
             (parsed as bogus attributes) and is moved here: viewport amount 0.2 means the
             entrance animation triggers when 20% of the card is visible. -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.1, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">TesserAct: Learning 4D Embodied World Models</h2>
            <p class="paper-summary">This paper presents an effective approach for learning novel 4D embodied
world models, which predict the dynamic evolution of 3D scenes over time in
response to an embodied agent's actions, providing both spatial and temporal
consistency. We propose to learn a 4D world model by training on RGB-DN (RGB,
Depth, and Normal) videos. This not only surpasses traditional 2D models by
incorporating detailed shape, configuration, and temporal changes into their
predictions, but also allows us to effectively learn accurate inverse dynamic
models for an embodied agent. Specifically, we first extend existing robotic
manipulation video datasets with depth and normal information leveraging
off-the-shelf models. Next, we fine-tune a video generation model on this
annotated dataset, which jointly predicts RGB-DN (RGB, Depth, and Normal) for
each frame. We then present an algorithm to directly convert generated RGB,
Depth, and Normal videos into a high-quality 4D scene of the world. Our method
ensures temporal and spatial coherence in 4D scene predictions from embodied
scenarios, enables novel view synthesis for embodied environments, and
facilitates policy learning that significantly outperforms those derived from
prior video-based world models.</p>

            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces tesseract, a method for learning 4d embodied world models from rgb-dn videos, which enables improved spatial and temporal consistency compared to 2d models and facilitates better policy learning.</p>
            <!-- Chinese translation of the TLDR; lang attribute lets screen readers switch voice -->
            <p class="paper-tldr" lang="zh-Hans"><strong>TLDR</strong>: 该论文介绍了 tesseract，一种通过 rgb-dn 视频学习 4d 具身世界模型的方法，与 2d 模型相比，它能够提高空间和时间一致性，并促进更好的策略学习。</p>

            <!-- Decorative star icons are aria-hidden; the textual (n/10) score carries the info -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
            </div>

            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i></span>
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>

            <!-- https + rel="noopener noreferrer" for the new-tab external link;
                 dead mr-1 removed (overridden by the .paper-link i margin rule) -->
            <a href="https://arxiv.org/abs/2504.20995v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Haoyu Zhen, Qiao Sun, Hongxin Zhang, Junyan Li, Siyuan Zhou, Yilun Du, Chuang Gan</p>
        </motion.div>
        
        <!-- Paper card. The CSS-style comment that sat inside the start tag was invalid HTML
             (parsed as bogus attributes) and is moved here: viewport amount 0.2 means the
             entrance animation triggers when 20% of the card is visible. The stagger delay is
             rounded to 0.15 (the generator emitted the float artifact 0.15000000000000002). -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.15, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">UniversalRAG: Retrieval-Augmented Generation over Multiple Corpora with Diverse Modalities and Granularities</h2>
            <p class="paper-summary">Retrieval-Augmented Generation (RAG) has shown substantial promise in
improving factual accuracy by grounding model responses with external knowledge
relevant to queries. However, most existing RAG approaches are limited to a
text-only corpus, and while recent efforts have extended RAG to other
modalities such as images and videos, they typically operate over a single
modality-specific corpus. In contrast, real-world queries vary widely in the
type of knowledge they require, which a single type of knowledge source cannot
address. To address this, we introduce UniversalRAG, a novel RAG framework
designed to retrieve and integrate knowledge from heterogeneous sources with
diverse modalities and granularities. Specifically, motivated by the
observation that forcing all modalities into a unified representation space
derived from a single combined corpus causes a modality gap, where the
retrieval tends to favor items from the same modality as the query, we propose
a modality-aware routing mechanism that dynamically identifies the most
appropriate modality-specific corpus and performs targeted retrieval within it.
Also, beyond modality, we organize each modality into multiple granularity
levels, enabling fine-tuned retrieval tailored to the complexity and scope of
the query. We validate UniversalRAG on 8 benchmarks spanning multiple
modalities, showing its superiority over modality-specific and unified
baselines.</p>

            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces universalrag, a retrieval-augmented generation framework that retrieves knowledge from diverse modalities and granularities, addressing the limitations of existing rag approaches that typically focus on a single modality. it uses a modality-aware routing mechanism and granularity levels for fine-tuned retrieval.</p>
            <!-- Chinese translation of the TLDR; lang attribute lets screen readers switch voice -->
            <p class="paper-tldr" lang="zh-Hans"><strong>TLDR</strong>: 该论文介绍了universalrag，一个检索增强生成框架，可以从不同的模态和粒度中检索知识，解决了现有rag方法通常只关注单一模态的局限性。它使用了一种模态感知的路由机制和粒度级别，以实现细粒度的检索。</p>

            <!-- Decorative star icons are aria-hidden; the textual (n/10) score carries the info -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
            </div>

            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i></span>
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>

            <!-- https + rel="noopener noreferrer" for the new-tab external link;
                 dead mr-1 removed (overridden by the .paper-link i margin rule) -->
            <a href="https://arxiv.org/abs/2504.20734v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Woongyeong Yeo, Kangsan Kim, Soyeong Jeong, Jinheon Baek, Sung Ju Hwang</p>
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.2, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">In-Context Edit: Enabling Instructional Image Editing with In-Context Generation in Large Scale Diffusion Transformer</h2>
            <p class="paper-summary">Instruction-based image editing enables robust image modification via natural
language prompts, yet current methods face a precision-efficiency tradeoff.
Fine-tuning methods demand significant computational resources and large
datasets, while training-free techniques struggle with instruction
comprehension and edit quality. We resolve this dilemma by leveraging
large-scale Diffusion Transformer (DiT)' enhanced generation capacity and
native contextual awareness. Our solution introduces three contributions: (1)
an in-context editing framework for zero-shot instruction compliance using
in-context prompting, avoiding structural changes; (2) a LoRA-MoE hybrid tuning
strategy that enhances flexibility with efficient adaptation and dynamic expert
routing, without extensive retraining; and (3) an early filter inference-time
scaling method using vision-language models (VLMs) to select better initial
noise early, improving edit quality. Extensive evaluations demonstrate our
method's superiority: it outperforms state-of-the-art approaches while
requiring only 0.5% training data and 1% trainable parameters compared to
conventional baselines. This work establishes a new paradigm that enables
high-precision yet efficient instruction-guided editing. Codes and demos can be
found in https://river-zhang.github.io/ICEdit-gh-pages/.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: this paper introduces an efficient and precise instruction-based image editing method called in-context edit, which leverages a diffusion transformer with in-context prompting, lora-moe tuning, and early filter inference-time scaling to outperform existing methods with significantly less training data and parameters.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 本文介绍了一种高效且精确的基于指令的图像编辑方法，名为 in-context edit。该方法利用扩散transformer（dit），通过上下文提示、lora-moe tuning 以及 early filter inference-time scaling，能够在显著减少训练数据和参数的情况下，超越现有的图像编辑方法。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.20690v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Zechuan Zhang, Ji Xie, Yu Lu, Zongxin Yang, Yi Yang</p>
            
        </motion.div>
        
        <!-- Animation triggers when 20% of the card is visible -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.25, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Efficient Listener: Dyadic Facial Motion Synthesis via Action Diffusion</h2>
            <p class="paper-summary">Generating realistic listener facial motions in dyadic conversations remains
challenging due to the high-dimensional action space and temporal dependency
requirements. Existing approaches usually consider extracting 3D Morphable
Model (3DMM) coefficients and modeling in the 3DMM space. However, this makes
the computational speed of the 3DMM a bottleneck, making it difficult to
achieve real-time interactive responses. To tackle this problem, we propose
Facial Action Diffusion (FAD), which introduces the diffusion methods from the
field of image generation to achieve efficient facial action generation. We
further build the Efficient Listener Network (ELNet) specially designed to
accommodate both the visual and audio information of the speaker as input.
By combining FAD and ELNet, the proposed method learns effective listener
facial motion representations and improves performance over the
state-of-the-art methods while reducing computational time by 99%.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: The paper introduces Facial Action Diffusion (FAD) and the Efficient Listener Network (ELNet) for real-time listener facial motion generation in dyadic conversations, achieving significant speed improvements compared to existing 3DMM-based methods.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了面部动作扩散（fad）和高效监听器网络（elnet），用于在二元对话中生成实时监听者面部动作，与现有的基于3dmm的方法相比，实现了显著的速度提升。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.20685v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Zesheng Wang, Alexandre Bruckert, Patrick Le Callet, Guangtao Zhai</p>
            
        </motion.div>
        
        <!-- Animation triggers when 20% of the card is visible -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.3, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">LDPoly: Latent Diffusion for Polygonal Road Outline Extraction in Large-Scale Topographic Mapping</h2>
            <p class="paper-summary">Polygonal road outline extraction from high-resolution aerial images is an
important task in large-scale topographic mapping, where roads are represented
as vectorized polygons, capturing essential geometric features with minimal
vertex redundancy. Despite its importance, no existing method has been
explicitly designed for this task. While polygonal building outline extraction
has been extensively studied, the unique characteristics of roads, such as
branching structures and topological connectivity, pose challenges to these
methods. To address this gap, we introduce LDPoly, the first dedicated
framework for extracting polygonal road outlines from high-resolution aerial
images. Our method leverages a novel Dual-Latent Diffusion Model with a
Channel-Embedded Fusion Module, enabling the model to simultaneously generate
road masks and vertex heatmaps. A tailored polygonization method is then
applied to obtain accurate vectorized road polygons with minimal vertex
redundancy. We evaluate LDPoly on a new benchmark dataset, Map2ImLas, which
contains detailed polygonal annotations for various topographic objects in
several Dutch regions. Our experiments include both in-region and cross-region
evaluations, with the latter designed to assess the model's generalization
performance on unseen regions. Quantitative and qualitative results demonstrate
that LDPoly outperforms state-of-the-art polygon extraction methods across
various metrics, including pixel-level coverage, vertex efficiency, polygon
regularity, and road connectivity. We also design two new metrics to assess
polygon simplicity and boundary smoothness. Moreover, this work represents the
first application of diffusion models for extracting precise vectorized object
outlines without redundant vertices from remote-sensing imagery, paving the way
for future advancements in this field.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: The paper introduces LDPoly, a novel dual-latent diffusion model for extracting polygonal road outlines from aerial images, demonstrating superior performance on a new benchmark dataset. It pioneers the use of diffusion models for precise vectorized object outline extraction in remote sensing.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了ldpoly，一种新型双潜变量扩散模型，用于从航空图像中提取多边形道路轮廓，并在新的基准数据集上展示了卓越的性能。它开创了使用扩散模型从遥感图像中提取精确矢量化对象轮廓的先河。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.20645v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Weiqin Jiao, Hao Cheng, George Vosselman, Claudio Persello</p>
            
        </motion.div>
        
        <!-- Animation triggers when 20% of the card is visible -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.35, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Dynamic Attention Analysis for Backdoor Detection in Text-to-Image Diffusion Models</h2>
            <p class="paper-summary">Recent studies have revealed that text-to-image diffusion models are
vulnerable to backdoor attacks, where attackers implant stealthy textual
triggers to manipulate model outputs. Previous backdoor detection methods
primarily focus on the static features of backdoor samples. However, a vital
property of diffusion models is their inherent dynamism. This study introduces
a novel backdoor detection perspective named Dynamic Attention Analysis (DAA),
showing that these dynamic characteristics serve as better indicators for
backdoor detection. Specifically, by examining the dynamic evolution of
cross-attention maps, we observe that backdoor samples exhibit distinct feature
evolution patterns at the &lt;EOS&gt; token compared to benign samples. To
quantify these dynamic anomalies, we first introduce DAA-I, which treats the
tokens' attention maps as spatially independent and measures the dynamic features
using the Frobenius norm. Furthermore, to better capture the interactions
between attention maps and refine the feature, we propose a dynamical
system-based approach, referred to as DAA-S. This model formulates the spatial
correlations among attention maps using a graph-based state equation and we
theoretically analyze the global asymptotic stability of this method. Extensive
experiments across five representative backdoor attack scenarios demonstrate
that our approach significantly surpasses existing detection methods, achieving
an average F1 Score of 79.49% and an AUC of 87.67%. The code is available at
https://github.com/Robin-WZQ/DAA.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: This paper introduces a novel approach, Dynamic Attention Analysis (DAA), for detecting backdoor attacks in text-to-image diffusion models by analyzing the dynamic evolution of cross-attention maps. Experiments show significant improvement over existing methods.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 本文提出了一种名为动态注意力分析 (daa) 的新方法，通过分析交叉注意力图的动态演变来检测文本到图像扩散模型中的后门攻击。实验表明，该方法比现有方法有显著改进。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.20518v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Zhongqi Wang, Jie Zhang, Shiguang Shan, Xilin Chen</p>
            
        </motion.div>
        
        <!-- Animation triggers when 20% of the card is visible -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.4, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">LMM4Gen3DHF: Benchmarking and Evaluating Multimodal 3D Human Face Generation with LMMs</h2>
            <p class="paper-summary">The rapid advancement of generative artificial intelligence has enabled the
creation of 3D human faces (HFs) for applications including media production,
virtual reality, security, healthcare, and game development. However,
assessing the quality and realism of these AI-generated 3D human faces remains
a significant challenge due to the subjective nature of human perception and
innate perceptual sensitivity to facial features. To this end, we conduct a
comprehensive study on the quality assessment of AI-generated 3D human faces.
We first introduce Gen3DHF, a large-scale benchmark comprising 2,000 videos of
AI-Generated 3D Human Faces along with 4,000 Mean Opinion Scores (MOS)
collected across two dimensions, i.e., quality and authenticity, 2,000
distortion-aware saliency maps and distortion descriptions. Based on Gen3DHF,
we propose LMME3DHF, a Large Multimodal Model (LMM)-based metric for Evaluating
3DHF capable of quality and authenticity score prediction, distortion-aware
visual question answering, and distortion-aware saliency prediction.
Experimental results show that LMME3DHF achieves state-of-the-art performance,
surpassing existing methods in both accurately predicting quality scores for
AI-generated 3D human faces and effectively identifying distortion-aware
salient regions and distortion types, while maintaining strong alignment with
human perceptual judgments. Both the Gen3DHF database and the LMME3DHF will be
released upon the publication.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: The paper introduces a new large-scale benchmark (Gen3DHF) for evaluating AI-generated 3D human faces and proposes a large multimodal model (LMME3DHF) for assessing quality and authenticity, demonstrating state-of-the-art performance.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了一个新的大规模基准测试（gen3dhf），用于评估人工智能生成的3d人脸，并提出了一个大型多模态模型（lmme3dhf）来评估质量和真实性，展示了最先进的性能。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.20466v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Woo Yi Yang, Jiarui Wang, Sijing Wu, Huiyu Duan, Yuxin Zhu, Liu Yang, Kang Fu, Guangtao Zhai, Xiongkuo Min</p>
            
        </motion.div>
        
        <!-- Animation triggers when 20% of the card is visible -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.45, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Inception: Jailbreak the Memory Mechanism of Text-to-Image Generation Systems</h2>
            <p class="paper-summary">Currently, the memory mechanism has been widely and successfully exploited in
online text-to-image (T2I) generation systems (e.g., DALL·E 3) for
alleviating the growing tokenization burden and capturing key information in
multi-turn interactions. Despite its practicality, its security analyses have
fallen far behind. In this paper, we reveal that this mechanism exacerbates the
risk of jailbreak attacks. Different from previous attacks that fuse the unsafe
target prompt into one ultimate adversarial prompt, which can be easily
detected or may generate non-unsafe images due to under- or over-optimization,
we propose Inception, the first multi-turn jailbreak attack against the memory
mechanism in real-world text-to-image generation systems. Inception embeds the
malice at the inception of the chat session turn by turn, leveraging the
mechanism that T2I generation systems retrieve key information in their memory.
Specifically, Inception mainly consists of two modules. It first segments the
unsafe prompt into chunks, which are subsequently fed to the system in multiple
turns, serving as pseudo-gradients for directive optimization. Specifically, we
develop a series of segmentation policies that ensure the images generated are
semantically consistent with the target prompt. Secondly, after segmentation,
to overcome the challenge of the inseparability of minimum unsafe words, we
propose recursion, a strategy that makes minimum unsafe words subdivisible.
Collectively, segmentation and recursion ensure that all the request prompts
are benign but can lead to malicious outcomes. We conduct experiments on the
real-world text-to-image generation system (i.e., DALL·E 3) to validate
the effectiveness of Inception. The results indicate that Inception surpasses
the state-of-the-art by a 14% margin in attack success rate.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: The paper introduces 'Inception,' a novel multi-turn jailbreak attack against memory mechanisms in text-to-image generation systems (DALL·E 3), achieving a 14% improvement in attack success rate compared to state-of-the-art methods.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 本文介绍了一种名为“inception”的新型多轮越狱攻击，针对文本到图像生成系统（dall·e 3）中的记忆机制，与现有技术相比，攻击成功率提高了 14%。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.20376v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Shiqian Zhao, Jiayang Liu, Yiming Li, Runyi Hu, Xiaojun Jia, Wenshu Fan, Xinfeng Li, Jie Zhang, Wei Dong, Tianwei Zhang, Luu Anh Tuan</p>
            
        </motion.div>
        
        <!-- Animation triggers when 20% of the card is visible -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.5, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">A Picture is Worth a Thousand Prompts? Efficacy of Iterative Human-Driven Prompt Refinement in Image Regeneration Tasks</h2>
            <p class="paper-summary">With AI-generated content becoming ubiquitous across the web, social media,
and other digital platforms, it is vital to examine how such content is
inspired and generated. The creation of AI-generated images often involves
refining the input prompt iteratively to achieve desired visual outcomes. This
study focuses on the relatively underexplored concept of image regeneration
using AI, in which a human operator attempts to closely recreate a specific
target image by iteratively refining their prompt. Image regeneration is
distinct from normal image generation, which lacks any predefined visual
reference. A separate challenge lies in determining whether existing image
similarity metrics (ISMs) can provide reliable, objective feedback in iterative
workflows, given that we do not fully understand if subjective human judgments
of similarity align with these metrics. Consequently, we must first validate
their alignment with human perception before assessing their potential as a
feedback mechanism in the iterative prompt refinement process. To address these
research gaps, we present a structured user study evaluating how iterative
prompt refinement affects the similarity of regenerated images relative to
their targets, while also examining whether ISMs capture the same improvements
perceived by human observers. Our findings suggest that incremental prompt
adjustments substantially improve alignment, verified through both subjective
evaluations and quantitative measures, underscoring the broader potential of
iterative workflows to enhance generative AI content creation across various
application domains.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: this paper investigates iterative prompt refinement for ai image regeneration (recreating a target image) and evaluates the alignment between image similarity metrics (isms) and human perception of similarity improvements during this process, finding that iterative refinement improves alignment based on both subjective and quantitative measures.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文研究了ai图像再生（重新创建目标图像）的迭代提示细化方法，并评估了图像相似性指标（ism）与人类对迭代过程中相似性提高的感知之间的一致性，发现基于主观和定量指标，迭代细化都能改进一致性。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.20340v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Khoi Trinh, Scott Seidenberger, Raveen Wijewickrama, Murtuza Jadliwala, Anindya Maiti</p>
            
        </motion.div>
        
        <!-- Paper card. viewport config animates the card once, when 20% of it becomes visible. -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.55, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Image Interpolation with Score-based Riemannian Metrics of Diffusion Models</h2>
            <p class="paper-summary">Diffusion models excel in content generation by implicitly learning the data
manifold, yet they lack a practical method to leverage this manifold - unlike
other deep generative models equipped with latent spaces. This paper introduces
a novel framework that treats the data space of pre-trained diffusion models as
a Riemannian manifold, with a metric derived from the score function.
Experiments with MNIST and Stable Diffusion show that this geometry-aware
approach yields image interpolations that are more realistic, less noisy, and
more faithful to prompts than existing methods, demonstrating its potential for
improved content generation and editing.</p>

            <p class="paper-tldr"><strong>TLDR</strong>: this paper introduces a riemannian manifold framework utilizing the score function of diffusion models to achieve geometry-aware image interpolation, resulting in more realistic and faithful outputs. it unlocks practical applications based on the data manifold learned by the diffusion model.</p>

            <p class="paper-tldr"><strong>TLDR</strong>: 本文提出了一种黎曼流形框架，利用扩散模型的得分函数实现几何感知的图像插值，从而产生更逼真和真实的输出。它解锁了基于扩散模型学习的数据流形的实际应用。</p>

            <!-- Star icons are decorative (hidden from AT); the "(n/10)" text carries the score. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
            </div>

            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i></span>
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>

            <a href="https://arxiv.org/abs/2504.20288v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Shinnosuke Saito, Takashi Matsubara</p>
        </motion.div>
        
        <!-- Paper card. viewport config animates the card once, when 20% of it becomes visible.
             delay normalized from the float artifact 0.6000000000000001 to 0.6. -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.6, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Physics-Informed Diffusion Models for SAR Ship Wake Generation from Text Prompts</h2>
            <p class="paper-summary">Detecting ship presence via wake signatures in SAR imagery is attracting
considerable research interest, but limited annotated data availability poses
significant challenges for supervised learning. Physics-based simulations are
commonly used to address this data scarcity, although they are slow and
constrain end-to-end learning. In this work, we explore a new direction for
more efficient and end-to-end SAR ship wake simulation using a diffusion model
trained on data generated by a physics-based simulator. The training dataset is
built by pairing images produced by the simulator with text prompts derived
from simulation parameters. Experimental result show that the model generates
realistic Kelvin wake patterns and achieves significantly faster inference than
the physics-based simulator. These results highlight the potential of diffusion
models for fast and controllable wake image generation, opening new
possibilities for end-to-end downstream tasks in maritime SAR analysis.</p>

            <p class="paper-tldr"><strong>TLDR</strong>: this paper introduces a physics-informed diffusion model for generating sar ship wake images from text prompts, offering a faster alternative to physics-based simulators and enabling end-to-end learning for maritime sar analysis.</p>

            <p class="paper-tldr"><strong>TLDR</strong>: 本文介绍了一种物理信息引导的扩散模型，用于从文本提示生成 sar 船舶尾流图像，为基于物理的模拟器提供了一种更快的替代方案，并支持海事 sar 分析的端到端学习。</p>

            <!-- Star icons are decorative (hidden from AT); the "(n/10)" text carries the score. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
            </div>

            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i></span>
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>

            <a href="https://arxiv.org/abs/2504.20241v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Kamirul Kamirul, Odysseas Pappas, Alin Achim</p>
        </motion.div>
        
        <!-- Paper card. viewport config animates the card once, when 20% of it becomes visible. -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.65, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Integration Flow Models</h2>
            <p class="paper-summary">Ordinary differential equation (ODE) based generative models have emerged as
a powerful approach for producing high-quality samples in many applications.
However, the ODE-based methods either suffer the discretization error of
numerical solvers of ODE, which restricts the quality of samples when only a
few NFEs are used, or struggle with training instability. In this paper, we
proposed Integration Flow, which directly learns the integral of ODE-based
trajectory paths without solving the ODE functions. Moreover, Integration Flow
explicitly incorporates the target state $\mathbf{x}_0$ as the anchor state in
guiding the reverse-time dynamics. We have theoretically proven this can
contribute to both stability and accuracy. To the best of our knowledge,
Integration Flow is the first model with a unified structure to estimate
ODE-based generative models and the first to show the exact straightness of
1-Rectified Flow without reflow. Through theoretical analysis and empirical
evaluations, we show that Integration Flows achieve improved performance when
it is applied to existing ODE-based models, such as diffusion models, Rectified
Flows, and PFGM++. Specifically, Integration Flow achieves one-step generation
on CIFAR10 with FIDs of 2.86 for the Variance Exploding (VE) diffusion model,
3.36 for rectified flow without reflow, and 2.91 for PFGM++; and on ImageNet
with FIDs of 4.09 for VE diffusion model, 4.35 for rectified flow without
reflow and 4.15 for PFGM++.</p>

            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces 'integration flow,' a novel approach to improve ode-based generative models by directly learning the integral of trajectory paths, leading to enhanced stability, accuracy, and one-step generation performance across various models like diffusion models and rectified flows.</p>

            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了“积分流”（integration flow），一种通过直接学习轨迹路径的积分来改进基于常微分方程（ode）的生成模型的新方法，从而提高了稳定性和准确性，并在扩散模型和纠正流等多种模型中实现了单步生成性能。</p>

            <!-- Star icons are decorative (hidden from AT); the "(n/10)" text carries the score. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
            </div>

            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i></span>
                <span class="text-xs text-gray-500 ml-1">(8/10)</span>
            </div>

            <a href="https://arxiv.org/abs/2504.20179v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Jingjing Wang, Dan Zhang, Joshua Luo, Yin Yang, Feng Luo</p>
        </motion.div>
        
        <!-- Paper card. viewport config animates the card once, when 20% of it becomes visible.
             delay normalized from the float artifact 0.7000000000000001 to 0.7. -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.7, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">CBM-RAG: Demonstrating Enhanced Interpretability in Radiology Report Generation with Multi-Agent RAG and Concept Bottleneck Models</h2>
            <p class="paper-summary">Advancements in generative Artificial Intelligence (AI) hold great promise
for automating radiology workflows, yet challenges in interpretability and
reliability hinder clinical adoption. This paper presents an automated
radiology report generation framework that combines Concept Bottleneck Models
(CBMs) with a Multi-Agent Retrieval-Augmented Generation (RAG) system to bridge
AI performance with clinical explainability. CBMs map chest X-ray features to
human-understandable clinical concepts, enabling transparent disease
classification. Meanwhile, the RAG system integrates multi-agent collaboration
and external knowledge to produce contextually rich, evidence-based reports.
Our demonstration showcases the system's ability to deliver interpretable
predictions, mitigate hallucinations, and generate high-quality, tailored
reports with an interactive interface addressing accuracy, trust, and usability
challenges. This framework provides a pathway to improving diagnostic
consistency and empowering radiologists with actionable insights.</p>

            <p class="paper-tldr"><strong>TLDR</strong>: this paper introduces a framework combining concept bottleneck models (cbms) with a multi-agent retrieval-augmented generation (rag) system for interpretable radiology report generation, aiming to improve diagnostic consistency and provide actionable insights for radiologists.</p>

            <p class="paper-tldr"><strong>TLDR</strong>: 本文介绍了一个框架，该框架结合概念颈瓶模型（cbm）和多代理检索增强生成（rag）系统，用于生成可解释的放射学报告，旨在提高诊断一致性并为放射科医生提供可操作的见解。</p>

            <!-- Star icons are decorative (hidden from AT); the "(n/10)" text carries the score. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
            </div>

            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i></span>
                <span class="text-xs text-gray-500 ml-1">(7/10)</span>
            </div>

            <a href="https://arxiv.org/abs/2504.20898v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Hasan Md Tusfiqur Alam, Devansh Srivastav, Abdulrahman Mohamed Selim, Md Abdul Kadir, Md Moktadiurl Hoque Shuvo, Daniel Sonntag</p>
        </motion.div>
        
        <!-- Paper card. viewport config animates the card once, when 20% of it becomes visible. -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.75, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">PixelHacker: Image Inpainting with Structural and Semantic Consistency</h2>
            <p class="paper-summary">Image inpainting is a fundamental research area between image editing and
image generation. Recent state-of-the-art (SOTA) methods have explored novel
attention mechanisms, lightweight architectures, and context-aware modeling,
demonstrating impressive performance. However, they often struggle with complex
structure (e.g., texture, shape, spatial relations) and semantics (e.g., color
consistency, object restoration, and logical correctness), leading to artifacts
and inappropriate generation. To address this challenge, we design a simple yet
effective inpainting paradigm called latent categories guidance, and further
propose a diffusion-based model named PixelHacker. Specifically, we first
construct a large dataset containing 14 million image-mask pairs by annotating
foreground and background (potential 116 and 21 categories, respectively).
Then, we encode potential foreground and background representations separately
through two fixed-size embeddings, and intermittently inject these features
into the denoising process via linear attention. Finally, by pre-training on
our dataset and fine-tuning on open-source benchmarks, we obtain PixelHacker.
Extensive experiments show that PixelHacker comprehensively outperforms the
SOTA on a wide range of datasets (Places2, CelebA-HQ, and FFHQ) and exhibits
remarkable consistency in both structure and semantics. Project page at
https://hustvl.github.io/projects/PixelHacker.</p>

            <p class="paper-tldr"><strong>TLDR</strong>: pixelhacker, a diffusion-based image inpainting model, uses latent category guidance and a pre-trained dataset of image-mask pairs to achieve state-of-the-art performance with improved structural and semantic consistency.</p>

            <p class="paper-tldr"><strong>TLDR</strong>: pixelhacker是一个基于扩散模型的图像修复模型，它使用潜在类别引导和一个预训练的图像-掩码对数据集，实现了最先进的性能，并改进了结构和语义的一致性。</p>

            <!-- Star icons are decorative (hidden from AT); the "(n/10)" text carries the score. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
            </div>

            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i></span>
                <span class="text-xs text-gray-500 ml-1">(7/10)</span>
            </div>

            <a href="https://arxiv.org/abs/2504.20438v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Ziyang Xu, Kangsheng Duan, Xiaolei Shen, Zhifeng Ding, Wenyu Liu, Xiaohu Ruan, Xiaoxin Chen, Xinggang Wang</p>
        </motion.div>
        
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }" /* Trigger when 20% is visible */
            transition="{ duration: 0.5, delay: 0.8, ease: 'easeOut' }"  
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">GarmentX: Autoregressive Parametric Representations for High-Fidelity 3D Garment Generation</h2>
            <p class="paper-summary">This work presents GarmentX, a novel framework for generating diverse,
high-fidelity, and wearable 3D garments from a single input image. Traditional
garment reconstruction methods directly predict 2D pattern edges and their
connectivity, an overly unconstrained approach that often leads to severe
self-intersections and physically implausible garment structures. In contrast,
GarmentX introduces a structured and editable parametric representation
compatible with GarmentCode, ensuring that the decoded sewing patterns always
form valid, simulation-ready 3D garments while allowing for intuitive
modifications of garment shape and style. To achieve this, we employ a masked
autoregressive model that sequentially predicts garment parameters, leveraging
autoregressive modeling for structured generation while mitigating
inconsistencies in direct pattern prediction. Additionally, we introduce
GarmentX dataset, a large-scale dataset of 378,682 garment parameter-image
pairs, constructed through an automatic data generation pipeline that
synthesizes diverse and high-quality garment images conditioned on parametric
garment representations. Through integrating our method with GarmentX dataset,
we achieve state-of-the-art performance in geometric fidelity and input image
alignment, significantly outperforming prior approaches. We will release
GarmentX dataset upon publication.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: GarmentX introduces a novel autoregressive framework for generating high-fidelity, wearable 3D garments from images, using a structured parametric representation and a large-scale dataset, achieving state-of-the-art performance.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: GarmentX 提出了一个新颖的自回归框架，用于从图像生成高保真、可穿戴的 3D 服装。该框架使用结构化的参数化表示和一个大规模数据集，并实现了最先进的性能。</p>
            

            
            
            <!-- Per-dimension scores. Star icons are decorative (aria-hidden);
                 the numeric "(n/10)" text carries each value for assistive technology. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
            </div>
            
            

            
            <!-- Overall rating: 3.5/5 stars (7/10). Icons are decorative; the
                 "(7/10)" text carries the value for assistive technology. -->
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star-half-alt" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <span class="text-xs text-gray-500 ml-1">(7/10)</span>
            </div>
            

            <!-- Opens in a new tab: rel hardens against reverse tabnabbing; icon is decorative. -->
            <a href="https://arxiv.org/abs/2504.20409v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Jingfeng Guo, Jinnan Chen, Weikai Chen, Zhenyu Sun, Lanjiong Li, Baozhu Zhao, Lingting Zhu, Xin Wang, Qi Liu</p>
            
        </motion.div>
        
        <!-- Animated paper card: fades/slides in once 20% of it enters the viewport.
             NOTE: the CSS-style comment previously embedded between attributes was
             removed — comments are not valid inside an HTML start tag. The delay
             value 0.8500000000000001 was a float-accumulation artifact; use 0.85. -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.85, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">MicarVLMoE: A Modern Gated Cross-Aligned Vision-Language Mixture of Experts Model for Medical Image Captioning and Report Generation</h2>
            <p class="paper-summary">Medical image reporting (MIR) aims to generate structured clinical
descriptions from radiological images. Existing methods struggle with
fine-grained feature extraction, multimodal alignment, and generalization
across diverse imaging types, often relying on vanilla transformers and
focusing primarily on chest X-rays. We propose MicarVLMoE, a vision-language
mixture-of-experts model with gated cross-aligned fusion, designed to address
these limitations. Our architecture includes: (i) a multiscale vision encoder
(MSVE) for capturing anatomical details at varying resolutions, (ii) a
multihead dual-branch latent attention (MDLA) module for vision-language
alignment through latent bottleneck representations, and (iii) a modulated
mixture-of-experts (MoE) decoder for adaptive expert specialization. We extend
MIR to CT scans, retinal imaging, MRI scans, and gross pathology images,
reporting state-of-the-art results on COVCTR, MMR, PGROSS, and ROCO datasets.
Extensive experiments and ablations confirm improved clinical accuracy,
cross-modal alignment, and model interpretability. Code is available at
https://github.com/AI-14/micar-vl-moe.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: The paper introduces MicarVLMoE, a vision-language mixture-of-experts model for medical image report generation, achieving state-of-the-art results across multiple medical imaging modalities and datasets.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了 MicarVLMoE，一种用于医学图像报告生成的视觉-语言混合专家模型，在多种医学成像模式和数据集上取得了最先进的结果。</p>
            

            
            
            <!-- Per-dimension scores. Star icons are decorative (aria-hidden);
                 the numeric "(n/10)" text carries each value for assistive technology. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
            </div>
            
            

            
            <!-- Overall rating: 3.5/5 stars (7/10). Icons are decorative; the
                 "(7/10)" text carries the value for assistive technology. -->
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star-half-alt" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <span class="text-xs text-gray-500 ml-1">(7/10)</span>
            </div>
            

            <!-- Opens in a new tab: rel hardens against reverse tabnabbing; icon is decorative. -->
            <a href="https://arxiv.org/abs/2504.20343v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Amaan Izhar, Nurul Japar, Norisma Idris, Ting Dang</p>
            
        </motion.div>
        
        <!-- Animated paper card: fades/slides in once 20% of it enters the viewport.
             NOTE: the CSS-style comment previously embedded between attributes was
             removed — comments are not valid inside an HTML start tag. -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.9, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Weaving Context Across Images: Improving Vision-Language Models through Focus-Centric Visual Chains</h2>
            <p class="paper-summary">Vision-language models (VLMs) achieve remarkable success in single-image
tasks. However, real-world scenarios often involve intricate multi-image
inputs, leading to a notable performance decline as models struggle to
disentangle critical information scattered across complex visual features. In
this work, we propose Focus-Centric Visual Chain, a novel paradigm that
enhances VLMs' perception, comprehension, and reasoning abilities in multi-image
scenarios. To facilitate this paradigm, we propose Focus-Centric Data
Synthesis, a scalable bottom-up approach for synthesizing high-quality data
with elaborate reasoning paths. Through this approach, we construct VISC-150K,
a large-scale dataset with reasoning data in the form of Focus-Centric Visual
Chain, specifically designed for multi-image tasks. Experimental results on
seven multi-image benchmarks demonstrate that our method achieves average
performance gains of 3.16% and 2.24% across two distinct model architectures,
without compromising the general vision-language capabilities. Our study
represents a significant step toward more robust and capable vision-language
systems that can handle complex visual scenarios.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: The paper introduces a novel Focus-Centric Visual Chain paradigm and a corresponding dataset (VISC-150K) to improve VLMs' performance on multi-image tasks, showing performance gains on multiple benchmarks.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文提出了一种新颖的焦点中心视觉链范式和一个相应的数据集（VISC-150K），以提高视觉语言模型在多图像任务上的性能，并在多个基准测试中显示出了性能提升。</p>
            

            
            
            <!-- Per-dimension scores. Star icons are decorative (aria-hidden);
                 the numeric "(n/10)" text carries each value for assistive technology. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
            </div>
            
            

            
            <!-- Overall rating: 3.5/5 stars (7/10). Icons are decorative; the
                 "(7/10)" text carries the value for assistive technology. -->
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star-half-alt" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <span class="text-xs text-gray-500 ml-1">(7/10)</span>
            </div>
            

            <!-- Opens in a new tab: rel hardens against reverse tabnabbing; icon is decorative. -->
            <a href="https://arxiv.org/abs/2504.20199v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Juntian Zhang, Chuanqi cheng, Yuhan Liu, Wei Liu, Jian Luan, Rui Yan</p>
            
        </motion.div>
        
        <!-- Animated paper card: fades/slides in once 20% of it enters the viewport.
             NOTE: the CSS-style comment previously embedded between attributes was
             removed — comments are not valid inside an HTML start tag. The delay
             value 0.9500000000000001 was a float-accumulation artifact; use 0.95. -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 0.95, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">Antidote: A Unified Framework for Mitigating LVLM Hallucinations in Counterfactual Presupposition and Object Perception</h2>
            <p class="paper-summary">Large Vision-Language Models (LVLMs) have achieved impressive results across
various cross-modal tasks. However, hallucinations, i.e., the models generating
counterfactual responses, remain a challenge. Though recent studies have
attempted to alleviate object perception hallucinations, they focus on the
models' response generation while overlooking the task question itself. This
paper discusses the vulnerability of LVLMs in solving counterfactual
presupposition questions (CPQs), where the models are prone to accept the
presuppositions of counterfactual objects and produce severe hallucinatory
responses. To this end, we introduce "Antidote", a unified, synthetic
data-driven post-training framework for mitigating both types of hallucination
above. It leverages synthetic data to incorporate factual priors into questions
to achieve self-correction, and decouple the mitigation process into a
preference optimization problem. Furthermore, we construct "CP-Bench", a novel
benchmark to evaluate LVLMs' ability to correctly handle CPQs and produce
factual responses. Applied to the LLaVA series, Antidote can simultaneously
enhance performance on CP-Bench by over 50%, POPE by 1.8-3.3%, and CHAIR & SHR
by 30-50%, all without relying on external supervision from stronger LVLMs or
human feedback and introducing noticeable catastrophic forgetting issues.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: The paper introduces "Antidote," a post-training framework, and a new benchmark, "CP-Bench," to mitigate hallucinations in LVLMs, specifically in counterfactual presupposition and object perception tasks, through synthetic data and preference optimization.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了 "Antidote"，一个后训练框架和一个新的基准 "CP-Bench"，旨在通过合成数据和偏好优化来减轻 LVLM 中的幻觉，特别是在反事实预设和对象感知任务中。</p>
            

            
            
            <!-- Per-dimension scores. Star icons are decorative (aria-hidden);
                 the numeric "(n/10)" text carries each value for assistive technology. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(4/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
            </div>
            
            

            
            <!-- Overall rating: 3/5 stars (6/10). Icons are decorative; the
                 "(6/10)" text carries the value for assistive technology. -->
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <span class="text-xs text-gray-500 ml-1">(6/10)</span>
            </div>
            

            <!-- Opens in a new tab: rel hardens against reverse tabnabbing; icon is decorative. -->
            <a href="https://arxiv.org/abs/2504.20468v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Yuanchen Wu, Lu Zhang, Hang Yao, Junlong Du, Ke Yan, Shouhong Ding, Yunsheng Wu, Xiaoqiang Li</p>
            
        </motion.div>
        
        <!-- Animated paper card: fades/slides in once 20% of it enters the viewport.
             NOTE: the CSS-style comment previously embedded between attributes was
             removed — comments are not valid inside an HTML start tag. -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 1.0, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">ChestX-Reasoner: Advancing Radiology Foundation Models with Reasoning through Step-by-Step Verification</h2>
            <p class="paper-summary">Recent advances in reasoning-enhanced large language models (LLMs) and
multimodal LLMs (MLLMs) have significantly improved performance in complex
tasks, yet medical AI models often overlook the structured reasoning processes
inherent in clinical practice. In this work, we present ChestX-Reasoner, a
radiology diagnosis MLLM designed to leverage process supervision mined
directly from clinical reports, reflecting the step-by-step reasoning followed
by radiologists. We construct a large dataset by extracting and refining
reasoning chains from routine radiology reports. Our two-stage training
framework combines supervised fine-tuning and reinforcement learning guided by
process rewards to better align model reasoning with clinical standards. We
introduce RadRBench-CXR, a comprehensive benchmark featuring 59K visual
question answering samples with 301K clinically validated reasoning steps, and
propose RadRScore, a metric evaluating reasoning factuality, completeness, and
effectiveness. ChestX-Reasoner outperforms existing medical and general-domain
MLLMs in both diagnostic accuracy and reasoning ability, achieving 16%, 5.9%,
and 18% improvements in reasoning ability compared to the best medical MLLM,
the best general MLLM, and its base model, respectively, as well as 3.3%, 24%,
and 27% improvements in outcome accuracy. All resources are open-sourced to
facilitate further research in medical reasoning MLLMs.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: ChestX-Reasoner introduces a radiology diagnosis MLLM trained with process supervision from clinical reports, along with a new benchmark and metric for evaluating reasoning in medical AI, demonstrating significant improvements in diagnostic accuracy and reasoning ability.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: ChestX-Reasoner 引入了一个利用临床报告进行过程监督训练的放射诊断多模态大语言模型，以及一个新的用于评估医学人工智能推理能力的基准和指标，在诊断准确性和推理能力方面表现出显著的改进。</p>
            

            
            
            <!-- Per-dimension scores. Star icons are decorative (aria-hidden);
                 the numeric "(n/10)" text carries each value for assistive technology. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(2/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
            </div>
            
            

            
            <!-- Overall rating: 2.5/5 stars (5/10). Icons are decorative; the
                 "(5/10)" text carries the value for assistive technology. -->
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star-half-alt" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <span class="text-xs text-gray-500 ml-1">(5/10)</span>
            </div>
            

            <!-- Opens in a new tab: rel hardens against reverse tabnabbing; icon is decorative. -->
            <a href="https://arxiv.org/abs/2504.20930v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Ziqing Fan, Cheng Liang, Chaoyi Wu, Ya Zhang, Yanfeng Wang, Weidi Xie</p>
            
        </motion.div>
        
        <!-- Animated paper card: fades/slides in once 20% of it enters the viewport.
             NOTE: the CSS-style comment previously embedded between attributes was
             removed — comments are not valid inside an HTML start tag. -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 1.05, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">TTTFusion: A Test-Time Training-Based Strategy for Multimodal Medical Image Fusion in Surgical Robots</h2>
            <p class="paper-summary">With the increasing use of surgical robots in clinical practice, enhancing
their ability to process multimodal medical images has become a key research
challenge. Although traditional medical image fusion methods have made progress
in improving fusion accuracy, they still face significant challenges in
real-time performance, fine-grained feature extraction, and edge
preservation. In this paper, we introduce TTTFusion, a Test-Time Training
(TTT)-based image fusion strategy that dynamically adjusts model parameters
during inference to efficiently fuse multimodal medical images. By adapting the
model during the test phase, our method optimizes the parameters based on the
input image data, leading to improved accuracy and better detail preservation
in the fusion results. Experimental results demonstrate that TTTFusion
significantly enhances the fusion quality of multimodal images compared to
traditional fusion methods, particularly in fine-grained feature extraction and
edge preservation. This approach not only improves image fusion accuracy but
also offers a novel technical solution for real-time image processing in
surgical robots.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: The paper introduces TTTFusion, a test-time training-based method for multimodal medical image fusion in surgical robots, improving accuracy and detail preservation during real-time image processing.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了 TTTFusion，一种基于测试时训练的多模态医学图像融合方法，应用于手术机器人，旨在提高实时图像处理过程中的准确性和细节保持。</p>
            

            
            
            <!-- Per-dimension scores. Star icons are decorative (aria-hidden);
                 the numeric "(n/10)" text carries each value for assistive technology. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(3/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star-half-alt" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(9/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="fas fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i><i class="far fa-star" aria-hidden="true"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
            </div>
            
            

            
            <!-- Overall rating: 2.5/5 stars (5/10). Icons are decorative; the
                 "(5/10)" text carries the value for assistive technology. -->
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star" aria-hidden="true"></i>
                <i class="fas fa-star-half-alt" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <i class="far fa-star" aria-hidden="true"></i>
                <span class="text-xs text-gray-500 ml-1">(5/10)</span>
            </div>
            

            <!-- Opens in a new tab: rel hardens against reverse tabnabbing; icon is decorative. -->
            <a href="https://arxiv.org/abs/2504.20362v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Qinhua Xie, Hao Tang</p>
            
        </motion.div>
        
        <!-- Animated paper card: fades/slides in once 20% of it enters the viewport.
             NOTE: the CSS-style comment previously embedded between attributes was
             removed — comments are not valid inside an HTML start tag. -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 1.1, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">A Transformer-based Multimodal Fusion Model for Efficient Crowd Counting Using Visual and Wireless Signals</h2>
            <p class="paper-summary">Current crowd-counting models often rely on single-modal inputs, such as
visual images or wireless signal data, which can result in significant
information loss and suboptimal recognition performance. To address these
shortcomings, we propose TransFusion, a novel multimodal fusion-based
crowd-counting model that integrates Channel State Information (CSI) with image
data. By leveraging the powerful capabilities of Transformer networks,
TransFusion effectively combines these two distinct data modalities, enabling
the capture of comprehensive global contextual information that is critical for
accurate crowd estimation. However, while transformers are well capable of
capturing global features, they potentially fail to identify finer-grained,
local details essential for precise crowd counting. To mitigate this, we
incorporate Convolutional Neural Networks (CNNs) into the model architecture,
enhancing its ability to extract detailed local features that complement the
global context provided by the Transformer. Extensive experimental evaluations
demonstrate that TransFusion achieves high accuracy with minimal counting
errors while maintaining superior efficiency.</p>
            
            <p class="paper-tldr"><strong>TLDR</strong>: the paper introduces transfusion, a novel crowd-counting model that fuses visual and wireless signals using a transformer network enhanced with cnns for improved accuracy and efficiency.</p>
            
            
            <p class="paper-tldr"><strong>TLDR</strong>: 该论文介绍了一种名为transfusion的新型人群计数模型，该模型使用transformer网络融合视觉和无线信号，并结合cnn，以提高准确性和效率。</p>
            

            
            
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(4/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                
                
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    
                    <i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
                
            </div>
            
            

            
            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star"></i>
                    
                
                    
                        <i class="fas fa-star-half-alt"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                    
                        <i class="far fa-star"></i>
                    
                
                <span class="text-xs text-gray-500 ml-1">(5/10)</span>
            </div>
            

            <a href="http://arxiv.org/abs/2504.20178v1" target="_blank" class="paper-link">
                <i class="fas fa-file-pdf mr-1"></i> Read Paper (PDF)
            </a>
            
            <p class="paper-authors">Authors: Zhe Cui, Yuli Li, Le-Nam Tran</p>
            
        </motion.div>
        
        <!-- Paper card: FLIM-based salient object detection. viewport amount
             0.2 = animation triggers when 20% of the card is visible. The
             comment previously sat inside the tag as a /* */ block, which HTML
             parses as bogus attributes; the generator's float artifact
             1.1500000000000001 is normalized to 1.15. -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 1.15, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">FLIM-based Salient Object Detection Networks with Adaptive Decoders</h2>
            <p class="paper-summary">Salient Object Detection (SOD) methods can locate objects that stand out in
an image, assign higher values to their pixels in a saliency map, and binarize
the map outputting a predicted segmentation mask. A recent tendency is to
investigate pre-trained lightweight models rather than deep neural networks in
SOD tasks, coping with applications under limited computational resources. In
this context, we have investigated lightweight networks using a methodology
named Feature Learning from Image Markers (FLIM), which assumes that the
encoder's kernels can be estimated from marker pixels on discriminative regions
of a few representative images. This work proposes flyweight networks, hundreds
of times lighter than lightweight models, for SOD by combining a FLIM encoder
with an adaptive decoder, whose weights are estimated for each input image by a
given heuristic function. Such FLIM networks are trained from three to four
representative images only and without backpropagation, making the models
suitable for applications under labeled data constraints as well. We study five
adaptive decoders; two of them are introduced here. Differently from the
previous ones that rely on one neuron per pixel with shared weights, the
heuristic functions of the new adaptive decoders estimate the weights of each
neuron per pixel. We compare FLIM models with adaptive decoders for two
challenging SOD tasks with three lightweight networks from the
state-of-the-art, two FLIM networks with decoders trained by backpropagation,
and one FLIM network whose labeled markers define the decoder's weights. The
experiments demonstrate the advantages of the proposed networks over the
baselines, revealing the importance of further investigating such methods in
new applications.</p>

            <p class="paper-tldr"><strong>TLDR</strong>: This paper introduces flyweight networks for salient object detection (SOD) using Feature Learning from Image Markers (FLIM) and adaptive decoders, trained on limited data and without backpropagation. The proposed networks outperform existing lightweight models while being significantly smaller.</p>

            <!-- lang="zh" so assistive tech and browsers pick the right voice/font -->
            <p class="paper-tldr" lang="zh"><strong>TLDR</strong>: 本文介绍了一种用于显著目标检测（SOD）的轻量级网络，该网络使用图像标记特征学习（FLIM）和自适应解码器，在有限的数据上训练且无需反向传播。所提出的网络在性能上优于现有的轻量级模型，且体积更小。</p>

            <!-- Star glyphs are decorative (the "(n/10)" text carries the score),
                 so they are hidden from assistive technology. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(2/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
            </div>

            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i></span>
                <span class="text-xs text-gray-500 ml-1">(4/10)</span>
            </div>

            <!-- rel="noopener noreferrer" guards the opener window when the
                 link opens in a new tab -->
            <a href="https://arxiv.org/abs/2504.20872v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Gilson Junior Soares, Matheus Abrantes Cerqueira, Jancarlo F. Gomes, Laurent Najman, Silvio Jamil F. Guimarães, Alexandre Xavier Falcão</p>
        </motion.div>
        
        <!-- Paper card: FBRT-YOLO (aerial image detection). viewport amount
             0.2 = animation triggers when 20% of the card is visible. The
             comment previously sat inside the tag as a /* */ block, which HTML
             parses as bogus attributes; the generator's float artifact
             1.2000000000000002 is normalized to 1.2. -->
        <motion.div
            initial="{ opacity: 0, y: 50, scale: 0.9 }"
            whileInView="{ opacity: 1, y: 0, scale: 1 }"
            viewport="{ once: true, amount: 0.2 }"
            transition="{ duration: 0.5, delay: 1.2, ease: 'easeOut' }"
            class="bento-item"
            data-motion-element
        >
            <h2 class="paper-title">FBRT-YOLO: Faster and Better for Real-Time Aerial Image Detection</h2>
            <p class="paper-summary">Embedded flight devices with visual capabilities have become essential for a
wide range of applications. In aerial image detection, while many existing
methods have partially addressed the issue of small target detection,
challenges remain in optimizing small target detection and balancing detection
accuracy with efficiency. These issues are key obstacles to the advancement of
real-time aerial image detection. In this paper, we propose a new family of
real-time detectors for aerial image detection, named FBRT-YOLO, to address the
imbalance between detection accuracy and efficiency. Our method comprises two
lightweight modules: Feature Complementary Mapping Module (FCM) and
Multi-Kernel Perception Unit (MKP), designed to enhance object perception for
small targets in aerial images. FCM focuses on alleviating the problem of
information imbalance caused by the loss of small target information in deep
networks. It aims to integrate spatial positional information of targets more
deeply into the network, better aligning with semantic information in the deeper
layers to improve the localization of small targets. We introduce MKP, which
leverages convolutions with kernels of different sizes to enhance the
relationships between targets of various scales and improve the perception of
targets at different scales. Extensive experimental results on three major
aerial image datasets, including Visdrone, UAVDT, and AI-TOD, demonstrate that
FBRT-YOLO outperforms various real-time detectors in terms of performance and
speed.</p>

            <p class="paper-tldr"><strong>TLDR</strong>: The paper introduces FBRT-YOLO, a new real-time object detector optimized for aerial images, using feature complementary mapping and a multi-kernel perception unit to improve small object detection accuracy and efficiency.</p>

            <!-- lang="zh" so assistive tech and browsers pick the right voice/font -->
            <p class="paper-tldr" lang="zh"><strong>TLDR</strong>: 该论文介绍了FBRT-YOLO，一种新的实时目标检测器，针对航空图像进行了优化，使用特征互补映射和多核感知单元，以提高小目标检测的准确性和效率。</p>

            <!-- Star glyphs are decorative (the "(n/10)" text carries the score),
                 so they are hidden from assistive technology. -->
            <div class="paper-sub-ratings" style="display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 5px; font-size: 0.8em;">
                <div class="rating-item">
                    <span class="rating-label">Relevance:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(2/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Novelty:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(7/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Clarity:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(8/10)</span>
                </div>
                <div class="rating-item">
                    <span class="rating-label">Potential Impact:</span>
                    <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="fas fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i></span>
                    <span class="text-xs text-gray-500 ml-1">(6/10)</span>
                </div>
            </div>

            <div class="paper-rating">
                <span class="rating-label" style="color: #000; font-weight: bold;">Overall:</span>
                <span aria-hidden="true"><i class="fas fa-star"></i><i class="fas fa-star-half-alt"></i><i class="far fa-star"></i><i class="far fa-star"></i><i class="far fa-star"></i></span>
                <span class="text-xs text-gray-500 ml-1">(3/10)</span>
            </div>

            <!-- rel="noopener noreferrer" guards the opener window when the
                 link opens in a new tab -->
            <a href="https://arxiv.org/abs/2504.20670v1" target="_blank" rel="noopener noreferrer" class="paper-link">
                <i class="fas fa-file-pdf mr-1" aria-hidden="true"></i> Read Paper (PDF)
            </a>

            <p class="paper-authors">Authors: Yao Xiao, Tingfa Xu, Yu Xin, Jianan Li</p>
        </motion.div>
        
    </div>

    <!-- Page footer: generation timestamp and attribution.
         rel="noopener noreferrer" added for the target="_blank" link. -->
    <footer class="footer">
        Generated on 2025-05-02 04:29:18 UTC. Powered by <a href="https://github.com/onion-liu" target="_blank" rel="noopener noreferrer">onion-liu</a>.
    </footer>

</body>
</html>